JUMP UMAP analysis with coSMicQC¶
This notebook analyzes JUMP data (cpg0000-jump-pilot) by leveraging UMAP and coSMicQC.
Outline¶
We focus on a single file from the JUMP dataset: BR00117012.sqlite.
This file is downloaded and prepared by CytoTable to form a single-cell Parquet file which includes all compartment feature data at the single-cell level.
We use coSMicQC to find and remove erroneous outlier data in order to prepare for UMAP analysis.
Afterwards, we use UMAP to demonstrate patterns within the data.
import logging
import pathlib
from typing import List, Optional
import cosmicqc
import holoviews
import hvplot.pandas
import numpy as np
import pandas as pd
import plotly.express as px
import pycytominer
import umap
from cytotable.convert import convert
from IPython.display import HTML, Image
from parsl.config import Config
from parsl.executors import ThreadPoolExecutor
from pyarrow import parquet
# use the bokeh backend for interactive hvplot visualizations
hvplot.extension("bokeh")
# create a local dir for exported plot images (no-op if it already exists)
pathlib.Path("./images").mkdir(exist_ok=True)
# silence bokeh plot-export warnings emitted during hvplot.save calls
logging.getLogger("bokeh.io.export").setLevel(logging.ERROR)
# JUMP plate identifier (BR00117012.sqlite) referenced throughout the notebook
example_plate = "BR00117012"
%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)
Define utility functions for use within this notebook¶
def generate_umap_embeddings(
    df_input: pd.DataFrame,
    cols_metadata_to_exclude: List[str],
    umap_n_components: int = 2,
    random_state: Optional[int] = None,
) -> np.ndarray:
    """
    Fit UMAP (Uniform Manifold Approximation and Projection)
    on the numeric feature columns of a dataframe and return
    the resulting embeddings.

    Args:
        df_input (pd.DataFrame):
            Dataframe expected to contain numeric columns
            to be used for UMAP fitting.
        cols_metadata_to_exclude (List[str]):
            Names of metadata columns to exclude from
            the UMAP transformation.
        umap_n_components (int):
            Number of components to use for UMAP.
            Default = 2.
        random_state (Optional[int]):
            Seed for optional determinism.
            Default = None (random each time).
            Note: any value besides None turns off
            parallelism for umap-learn, likely meaning
            increased processing time.

    Returns:
        np.ndarray:
            UMAP embeddings with umap_n_components
            columns and one row per input row.
    """
    # build a fresh UMAP instance per call (i.e. per plate)
    reducer = umap.UMAP(
        n_components=umap_n_components,
        random_state=random_state,
        # use all cores when unseeded; umap-learn forces
        # single-job behavior when a random_state is set,
        # so pass 1 explicitly to avoid its warning.
        n_jobs=-1 if random_state is None else 1,
    )

    # drop excluded metadata columns and any cosmicqc ("cqc.")
    # columns, then keep only numeric dtypes for fitting
    feature_cols = [
        col
        for col in df_input.columns.tolist()
        if col not in cols_metadata_to_exclude and "cqc." not in col
    ]
    features = df_input[feature_cols].select_dtypes(include=[np.number])

    # fit UMAP and return the embedding array
    return reducer.fit_transform(X=features)
def plot_hvplot_scatter(
    embeddings: np.ndarray,
    title: str,
    filename: str,
    color_dataframe: Optional[pd.DataFrame] = None,
    color_column: Optional[str] = None,
    bgcolor: str = "black",
    cmap: str = "plasma",
    clabel: Optional[str] = None,
) -> holoviews.core.spaces.DynamicMap:
    """
    Create (and export to file) an outlier-focused scatter hvplot
    for viewing UMAP embedding data with cosmicqc outlier coloration.

    Args:
        embeddings (np.ndarray):
            Embedding data to display (components 0 and 1
            are plotted as x and y).
        title (str):
            Title for the UMAP scatter plot.
        filename (str):
            Filename which indicates where to export the plot.
        color_dataframe (Optional[pd.DataFrame]):
            Dataframe with data used for color mapping within
            the plot, e.g. coSMicQC .is_outlier columns.
        color_column (Optional[str]):
            Column name from color_dataframe used for coloring
            the scatter points.
        bgcolor (str):
            Background color of the plot.
        cmap (str):
            Colormap used for the plot. See:
            https://holoviews.org/user_guide/Colormaps.html
        clabel (Optional[str]):
            Label shown on the horizontal color map key.
            Defaults to None (no label).

    Returns:
        holoviews.core.spaces.DynamicMap:
            A dynamic holoviews scatter plot which may be
            displayed in a Jupyter notebook.
    """
    # resolve optional color values up front; boolean outlier
    # flags are cast to int so the colormap can be applied
    if color_dataframe is not None:
        color_values = color_dataframe[color_column].astype(int).values
    else:
        color_values = None

    # build the scatter plot through hvplot on a dataframe view
    # of the embeddings (columns are stringified as "0", "1", ...)
    df_embeddings = pd.DataFrame(embeddings)
    plot = df_embeddings.hvplot.scatter(
        title=title,
        x="0",
        y="1",
        alpha=0.1,
        rasterize=True,
        c=color_values,
        cnorm="linear",
        cmap=cmap,
        bgcolor=bgcolor,
        height=700,
        width=800,
        clabel=clabel,
    )

    # export the plot to disk before returning it
    hvplot.save(obj=plot, filename=filename, center=False)

    return plot
Merge single-cell compartment data into one table¶
# destination for the CytoTable-prepared single-cell parquet file
dest_parquet = f"./{example_plate}.parquet"

# reuse prepared data when present; otherwise build it
if pathlib.Path(dest_parquet).is_file():
    merged_single_cells = dest_parquet
else:
    # process BR00117012.sqlite using CytoTable to prepare data
    merged_single_cells = convert(
        source_path=(
            "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace"
            "/backend/2020_11_04_CPJUMP1/BR00117012/BR00117012.sqlite"
        ),
        dest_path=dest_parquet,
        dest_datatype="parquet",
        source_datatype="sqlite",
        chunk_size=8000,
        preset="cellprofiler_sqlite_cpg0016_jump",
        # allows AWS S3 requests without login
        no_sign_request=True,
        # use explicit cache to avoid temp cache removal
        local_cache_dir="./sqlite_s3_cache/",
        parsl_config=Config(
            executors=[ThreadPoolExecutor(label="tpe_for_jump_processing")]
        ),
        sort_output=False,
    )

# read only the metadata from parquet file
parquet.ParquetFile(merged_single_cells).metadata
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[4], line 4
1 # check if we already have prepared data
2 if not pathlib.Path(f"./{example_plate}.parquet").is_file():
3 # process BR00117012.sqlite using CytoTable to prepare data
----> 4 merged_single_cells = convert(
5 source_path=(
6 "s3://cellpainting-gallery/cpg0000-jump-pilot/source_4/workspace"
7 "/backend/2020_11_04_CPJUMP1/BR00117012/BR00117012.sqlite"
8 ),
9 dest_path=f"./{example_plate}.parquet",
10 dest_datatype="parquet",
11 source_datatype="sqlite",
12 chunk_size=8000,
13 preset="cellprofiler_sqlite_cpg0016_jump",
14 # allows AWS S3 requests without login
15 no_sign_request=True,
16 # use explicit cache to avoid temp cache removal
17 local_cache_dir="./sqlite_s3_cache/",
18 parsl_config=Config(
19 executors=[ThreadPoolExecutor(label="tpe_for_jump_processing")]
20 ),
21 sort_output=False,
22 )
23 else:
24 merged_single_cells = f"./{example_plate}.parquet"
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/convert.py:1451, in convert(source_path, dest_path, dest_datatype, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, data_type_cast_map, page_keys, sort_output, preset, parsl_config, **kwargs)
1449 # send sources to be written to parquet if selected
1450 if dest_datatype == "parquet":
-> 1451 output = _to_parquet(
1452 source_path=source_path,
1453 dest_path=dest_path,
1454 source_datatype=source_datatype,
1455 metadata=metadata,
1456 compartments=compartments,
1457 identifying_columns=identifying_columns,
1458 concat=concat,
1459 join=join,
1460 joins=joins,
1461 chunk_size=chunk_size,
1462 infer_common_schema=infer_common_schema,
1463 drop_null=drop_null,
1464 data_type_cast_map=data_type_cast_map,
1465 sort_output=sort_output,
1466 page_keys=cast(dict, page_keys),
1467 **kwargs,
1468 )
1470 return output
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/convert.py:1055, in _to_parquet(source_path, dest_path, source_datatype, metadata, compartments, identifying_columns, concat, join, joins, chunk_size, infer_common_schema, drop_null, sort_output, page_keys, data_type_cast_map, **kwargs)
999 """
1000 Export data to parquet.
1001
(...)
1051 result.
1052 """
1054 # gather sources to be processed
-> 1055 sources = _gather_sources(
1056 source_path=source_path,
1057 source_datatype=source_datatype,
1058 targets=(
1059 list(metadata) + list(compartments)
1060 if metadata is not None and compartments is not None
1061 else []
1062 ),
1063 **kwargs,
1064 )
1066 # expand the destination path
1067 expanded_dest_path = _expand_path(path=dest_path)
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/sources.py:330, in _gather_sources(source_path, source_datatype, targets, **kwargs)
327 built_path = _build_path(path=source_path, **kwargs)
329 # gather filepaths which will be used as the basis for this work
--> 330 sources = _get_source_filepaths(
331 path=built_path, targets=targets, source_datatype=source_datatype
332 )
334 # infer or validate the source datatype based on source filepaths
335 source_datatype = _infer_source_datatype(
336 sources=sources, source_datatype=source_datatype
337 )
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/sources.py:79, in _get_source_filepaths(path, targets, source_datatype)
74 raise DatatypeException(
75 "A source_datatype must be specified when using undefined compartments and metadata names."
76 )
78 # gathers files from provided path using compartments + metadata as a filter
---> 79 sources = [
80 # build source_paths for all files
81 # note: builds local cache for sqlite files from cloud
82 {"source_path": _cache_cloudpath_to_local(subpath)}
83 # loop for navigating single file or subpaths
84 for subpath in (
85 (path,)
86 # used if the source path is a single file
87 if path.is_file()
88 # iterates through a source directory
89 else (x for x in path.glob("**/*") if x.is_file())
90 )
91 # ensure the subpaths meet certain specifications
92 if (
93 targets is None
94 or targets == []
95 # checks for name of the file from targets (compartment + metadata names)
96 or str(subpath.stem).lower() in [target.lower() for target in targets]
97 # checks for sqlite extension (which may include compartment + metadata names)
98 or subpath.suffix.lower() == ".sqlite"
99 )
100 ]
102 # expand sources to include sqlite tables similarly to files (one entry per table)
103 expanded_sources = []
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/sources.py:82, in <listcomp>(.0)
74 raise DatatypeException(
75 "A source_datatype must be specified when using undefined compartments and metadata names."
76 )
78 # gathers files from provided path using compartments + metadata as a filter
79 sources = [
80 # build source_paths for all files
81 # note: builds local cache for sqlite files from cloud
---> 82 {"source_path": _cache_cloudpath_to_local(subpath)}
83 # loop for navigating single file or subpaths
84 for subpath in (
85 (path,)
86 # used if the source path is a single file
87 if path.is_file()
88 # iterates through a source directory
89 else (x for x in path.glob("**/*") if x.is_file())
90 )
91 # ensure the subpaths meet certain specifications
92 if (
93 targets is None
94 or targets == []
95 # checks for name of the file from targets (compartment + metadata names)
96 or str(subpath.stem).lower() in [target.lower() for target in targets]
97 # checks for sqlite extension (which may include compartment + metadata names)
98 or subpath.suffix.lower() == ".sqlite"
99 )
100 ]
102 # expand sources to include sqlite tables similarly to files (one entry per table)
103 expanded_sources = []
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cytotable/utils.py:323, in _cache_cloudpath_to_local(path)
315 if (
316 isinstance(path, CloudPath)
317 and path.is_file()
318 and path.suffix.lower() == ".sqlite"
319 ):
320 try:
321 # update the path to be the local filepath for reference in CytoTable ops
322 # note: incurs a data read which will trigger caching of the file
--> 323 path = pathlib.Path(path.fspath)
324 except InvalidPrefixError:
325 # share information about not finding a cloud path
326 logger.info(
327 "Did not detect a cloud path based on prefix. Defaulting to use local path operations."
328 )
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cloudpathlib/cloudpath.py:412, in CloudPath.fspath(self)
410 @property
411 def fspath(self) -> str:
--> 412 return self.__fspath__()
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cloudpathlib/cloudpath.py:312, in CloudPath.__fspath__(self)
310 def __fspath__(self) -> str:
311 if self.is_file():
--> 312 self._refresh_cache(force_overwrite_from_cloud=False)
313 return str(self._local)
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cloudpathlib/cloudpath.py:1121, in CloudPath._refresh_cache(self, force_overwrite_from_cloud)
1114 if (
1115 not self._local.exists()
1116 or (self._local.stat().st_mtime < stats.st_mtime)
1117 or force_overwrite_from_cloud
1118 ):
1119 # ensure there is a home for the file
1120 self._local.parent.mkdir(parents=True, exist_ok=True)
-> 1121 self.download_to(self._local)
1123 # force cache time to match cloud times
1124 os.utime(self._local, times=(stats.st_mtime, stats.st_mtime))
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cloudpathlib/cloudpath.py:900, in CloudPath.download_to(self, destination)
898 if destination.is_dir():
899 destination = destination / self.name
--> 900 return self.client._download_file(self, destination)
901 else:
902 destination.mkdir(exist_ok=True)
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/cloudpathlib/s3/s3client.py:155, in S3Client._download_file(self, cloud_path, local_path)
152 local_path = Path(local_path)
153 obj = self.s3.Object(cloud_path.bucket, cloud_path.key)
--> 155 obj.download_file(
156 str(local_path), Config=self.boto3_transfer_config, ExtraArgs=self.boto3_dl_extra_args
157 )
158 return local_path
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/boto3/s3/inject.py:361, in object_download_file(self, Filename, ExtraArgs, Callback, Config)
330 def object_download_file(
331 self, Filename, ExtraArgs=None, Callback=None, Config=None
332 ):
333 """Download an S3 object to a file.
334
335 Usage::
(...)
359 transfer.
360 """
--> 361 return self.meta.client.download_file(
362 Bucket=self.bucket_name,
363 Key=self.key,
364 Filename=Filename,
365 ExtraArgs=ExtraArgs,
366 Callback=Callback,
367 Config=Config,
368 )
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/boto3/s3/inject.py:192, in download_file(self, Bucket, Key, Filename, ExtraArgs, Callback, Config)
157 """Download an S3 object to a file.
158
159 Usage::
(...)
189 transfer.
190 """
191 with S3Transfer(self, Config) as transfer:
--> 192 return transfer.download_file(
193 bucket=Bucket,
194 key=Key,
195 filename=Filename,
196 extra_args=ExtraArgs,
197 callback=Callback,
198 )
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/boto3/s3/transfer.py:406, in S3Transfer.download_file(self, bucket, key, filename, extra_args, callback)
402 future = self._manager.download(
403 bucket, key, filename, extra_args, subscribers
404 )
405 try:
--> 406 future.result()
407 # This is for backwards compatibility where when retries are
408 # exceeded we need to throw the same error from boto3 instead of
409 # s3transfer's built in RetriesExceededError as current users are
410 # catching the boto3 one instead of the s3transfer exception to do
411 # their own retries.
412 except S3TransferRetriesExceededError as e:
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/s3transfer/futures.py:106, in TransferFuture.result(self)
104 except KeyboardInterrupt as e:
105 self.cancel()
--> 106 raise e
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/s3transfer/futures.py:103, in TransferFuture.result(self)
98 def result(self):
99 try:
100 # Usually the result() method blocks until the transfer is done,
101 # however if a KeyboardInterrupt is raised we want want to exit
102 # out of this and propagate the exception.
--> 103 return self._coordinator.result()
104 except KeyboardInterrupt as e:
105 self.cancel()
File ~/.cache/pypoetry/virtualenvs/cosmicqc-_xpxio4a-py3.11/lib/python3.11/site-packages/s3transfer/futures.py:261, in TransferCoordinator.result(self)
251 """Waits until TransferFuture is done and returns the result
252
253 If the TransferFuture succeeded, it will return the result. If the
254 TransferFuture failed, it will raise the exception associated to the
255 failure.
256 """
257 # Doing a wait() with no timeout cannot be interrupted in python2 but
258 # can be interrupted in python3 so we just wait with the largest
259 # possible value integer value, which is on the scale of billions of
260 # years...
--> 261 self._done_event.wait(MAXINT)
263 # Once done waiting, raise an exception if present or return the
264 # final result.
265 if self._exception:
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/threading.py:629, in Event.wait(self, timeout)
627 signaled = self._flag
628 if not signaled:
--> 629 signaled = self._cond.wait(timeout)
630 return signaled
File /opt/hostedtoolcache/Python/3.11.10/x64/lib/python3.11/threading.py:327, in Condition.wait(self, timeout)
325 try: # restore state no matter what (e.g., KeyboardInterrupt)
326 if timeout is None:
--> 327 waiter.acquire()
328 gotit = True
329 else:
KeyboardInterrupt:
Process merged single-cell data using coSMicQC¶
# inspect the parquet schema for metadata column names
plate_schema = parquet.read_schema(merged_single_cells)
schema_names = plate_schema.names
# display the first dozen column names
schema_names[:12]
['Metadata_ImageNumber',
'Image_Metadata_Row',
'Image_Metadata_Site',
'Metadata_ObjectNumber',
'Metadata_ObjectNumber_1',
'Metadata_ObjectNumber_2',
'Metadata_Plate',
'Metadata_Well',
'Image_TableNumber',
'Cytoplasm_AreaShape_Area',
'Cytoplasm_AreaShape_BoundingBoxArea',
'Cytoplasm_AreaShape_BoundingBoxMaximum_X']
# metadata column names referenced throughout the analysis below
metadata_cols = (
    "Metadata_ImageNumber Image_Metadata_Row Image_Metadata_Site "
    "Metadata_ObjectNumber Metadata_Plate Metadata_Well Image_TableNumber"
).split()
# nuclei feature columns needed for outlier detection
outlier_feature_cols = [
    "Nuclei_AreaShape_Area",
    "Nuclei_AreaShape_FormFactor",
    "Nuclei_AreaShape_Eccentricity",
]

# read only the metadata columns plus the features
# used below for cosmicqc outlier detection
df_merged_single_cells = pd.read_parquet(
    path=merged_single_cells,
    columns=metadata_cols + outlier_feature_cols,
)

# preview the loaded dataframe
df_merged_single_cells.head()
| Metadata_ImageNumber | Image_Metadata_Row | Image_Metadata_Site | Metadata_ObjectNumber | Metadata_Plate | Metadata_Well | Image_TableNumber | Nuclei_AreaShape_Area | Nuclei_AreaShape_FormFactor | Nuclei_AreaShape_Eccentricity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 1 | 2 | 2 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1091 | 0.895393 | 0.694154 |
| 1 | 2 | 1 | 2 | 3 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1007 | 0.837631 | 0.819062 |
| 2 | 2 | 1 | 2 | 4 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 1368 | 0.833197 | 0.820257 |
| 3 | 2 | 1 | 2 | 5 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 847 | 0.902995 | 0.345575 |
| 4 | 2 | 1 | 2 | 6 | BR00117012 | A01 | 35196781680809101223867031596229856156 | 983 | 0.863005 | 0.742060 |
# detect large-nuclei outliers using cosmicqc's
# built-in "large_nuclei" threshold set
print("Large nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="large_nuclei",
    metadata_columns=metadata_cols,
)
Large nuclei outliers:
Number of outliers: 1355 (0.48%)
Outliers Range:
Nuclei_AreaShape_Area Min: 1754
Nuclei_AreaShape_Area Max: 4414
Nuclei_AreaShape_FormFactor Min: 0.3367261940249281
Nuclei_AreaShape_FormFactor Max: 0.7140072671383899
# detect elongated-nuclei outliers using cosmicqc's
# built-in "elongated_nuclei" threshold set
print("Elongated nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="elongated_nuclei",
    metadata_columns=metadata_cols,
)
Elongated nuclei outliers:
Number of outliers: 15 (0.01%)
Outliers Range:
Nuclei_AreaShape_Eccentricity Min: 0.9868108584805481
Nuclei_AreaShape_Eccentricity Max: 0.9995098494433163
# detect small / low form-factor nuclei outliers using cosmicqc's
# built-in "small_and_low_formfactor_nuclei" threshold set
print("Small and low formfactor nuclei outliers:")
df_labeled_outliers = cosmicqc.analyze.find_outliers(
    df=df_merged_single_cells,
    feature_thresholds="small_and_low_formfactor_nuclei",
    metadata_columns=metadata_cols,
)
Small and low formfactor nuclei outliers:
Number of outliers: 6548 (2.34%)
Outliers Range:
Nuclei_AreaShape_Area Min: 79
Nuclei_AreaShape_Area Max: 744
Nuclei_AreaShape_FormFactor Min: 0.0945907341645769
Nuclei_AreaShape_FormFactor Max: 0.7781815132858318
# label outliers across all default cosmicqc threshold sets,
# keeping the per-threshold z-score columns in the result
df_labeled_outliers = cosmicqc.analyze.label_outliers(
    df=df_merged_single_cells,
    include_threshold_scores=True,
)

# preview the cosmicqc ("cqc.") columns that were added
cqc_cols = [col for col in df_labeled_outliers.columns.tolist() if "cqc." in col]
df_labeled_outliers[cqc_cols].head()
| cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.small_and_low_formfactor_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.small_and_low_formfactor_nuclei.is_outlier | cqc.elongated_nuclei.Z_Score.Nuclei_AreaShape_Eccentricity | cqc.elongated_nuclei.is_outlier | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_Area | cqc.large_nuclei.Z_Score.Nuclei_AreaShape_FormFactor | cqc.large_nuclei.is_outlier | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.030121 | 0.826248 | False | -0.154094 | False | 0.030121 | 0.826248 | False |
| 1 | -0.219592 | -0.073800 | False | 0.765830 | False | -0.219592 | -0.073800 | False |
| 2 | 0.853580 | -0.142903 | False | 0.774634 | False | 0.853580 | -0.142903 | False |
| 3 | -0.695236 | 0.944704 | False | -2.721308 | False | -0.695236 | 0.944704 | False |
| 4 | -0.290938 | 0.321578 | False | 0.198723 | False | -0.290938 | 0.321578 | False |
# create a column which indicates whether an erroneous outlier was detected
# from all cosmicqc outlier threshold sets. For ex. True for is_outlier in
# one threshold set out of three would show True for this column. False for
# is_outlier in all threshold sets would show False for this column.
outlier_flag_cols = [
    col for col in df_labeled_outliers.columns.tolist() if ".is_outlier" in col
]
df_labeled_outliers["analysis.included_at_least_one_outlier"] = df_labeled_outliers[
    outlier_flag_cols
].any(axis=1)

# show value counts for all outliers
outliers_counts = df_labeled_outliers[
    "analysis.included_at_least_one_outlier"
].value_counts()
outliers_counts
analysis.included_at_least_one_outlier
False 271883
True 7906
Name: count, dtype: int64
# show the percentage of the total dataset which includes
# erroneous outliers of some kind.
# note: index by label (True/False) rather than position so the
# math does not depend on value_counts sort order, and divide by
# the full cell count (inliers + outliers) — the previous version
# divided by the inlier count only, overstating the percentage.
total_cells = int(outliers_counts.sum())
print(
    (outliers_counts.get(True, 0) / total_cells) * 100,
    "%",
    "of",
    total_cells,
    "include erroneous outliers of some kind.",
)
2.9078684581235312 % of 271883 include erroneous outliers of some kind.
# show histograms to help visualize the data distributions alongside
# the cosmicqc outlier thresholds (trailing semicolon suppresses the
# notebook's automatic repr of the return value)
df_labeled_outliers.show_report(); # fmt: skip